import pandas as pd
import numpy as np
import lightgbm as lgb
from polynomial_boosting import PolynomialBoostingModel
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
Gradient Boosting with Polynomial Decision Trees

Fit an arbitrarily complex polynomial in the decision tree leaf nodes, rather than a flat line. The code for polynomial_boosting can be found here.
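To make the idea concrete, here is a minimal sketch of what a single leaf computes; it is not taken from the polynomial_boosting package, and fit_polynomial_leaf is a hypothetical name. Instead of predicting the constant mean of the residuals that fall into a leaf, the leaf fits a ridge-regularized polynomial of its inputs to those residuals (reusing the numpy import from above):

#hedged sketch of the core idea, not the actual polynomial_boosting code
def fit_polynomial_leaf(X_leaf, residuals, degree=1, reg=5.0):
    #design matrix [1, x, x^2, ..., x^degree], built per feature
    Z = np.hstack([np.ones((X_leaf.shape[0], 1))] +
                  [X_leaf**d for d in range(1, degree + 1)])
    #ridge-regularized least squares gives the leaf's polynomial coefficients
    beta = np.linalg.solve(Z.T @ Z + reg * np.eye(Z.shape[1]), Z.T @ residuals)
    return beta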
np.random.seed(123)

def f(X):
    return X**2

N = 1000
X = np.random.uniform(size=(N,1)) * 6 - 3

#model requires that `y` is a matrix and not a vector (contrary to sklearn models)
y = f(X) + np.random.normal(size=(N,1)) * 0.5

line = np.linspace(-3,3,250)

plt.scatter(X[:,0], y[:,0], s=1)
plt.plot(line, f(line), c="red")
model = PolynomialBoostingModel(1,    #polynomial_level
                                0.25, #learning rate
                                5.0,  #regression regularization
                                100,  #n_trees
                                10,   #min_samples_leaf
                                1.0,  #goss alpha
                                0.0,  #goss beta
                                1)    #random seed

model.fit_fast(X, y)

preds = model.predict_fast(line.reshape(-1,1)).reshape(-1)

plt.scatter(X[:,0], y[:,0], s=0.05)
plt.plot(line, f(line), c="red")
plt.plot(line, preds, c="green")
model2 = GradientBoostingRegressor(max_depth=1, n_estimators=100, learning_rate=0.25)
model2.fit(X, y.reshape(-1))

preds2 = model2.predict(line.reshape(-1,1)).reshape(-1)

plt.scatter(X[:,0], y[:,0], s=0.05)
plt.plot(line, f(line), c="red")
plt.plot(line, preds2, c="green")
lgb_dataset = lgb.Dataset(X, label=y.reshape(-1), params={'linear_tree': True})

lgb_params = {
    "objective": "regression",
    "metric": "l2",
    "num_iterations": 100,
    "num_leaves": 2,
    "learning_rate": 0.25,
    "linear_lambda": 5.0,
    "min_data_in_leaf": 10,
    "verbosity": -1
}

model3 = lgb.train(lgb_params, lgb_dataset)

preds3 = model3.predict(line.reshape(-1,1)).reshape(-1)

plt.scatter(X[:,0], y[:,0], s=0.05)
plt.plot(line, f(line), c="red")
plt.plot(line, preds3, c="green")
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 6))

axes[0].scatter(X[:,0], y[:,0], color='blue', label='Training Data', s=0.1)
axes[0].plot(line, f(line), color='green', label='True Function')
axes[0].plot(line, preds, color='red', label='Predicted')
axes[0].set_title('Polynomial (linear) Boosting - MSE: {:.4f}'.format(np.mean((preds-f(line))**2)))
axes[0].set_xlabel('X')
axes[0].set_ylabel('Y')
axes[0].legend()

axes[1].scatter(X[:,0], y[:,0], color='blue', label='Training Data', s=0.1)
axes[1].plot(line, f(line), color='green', label='True Function')
axes[1].plot(line, preds2, color='red', label='Predicted')
axes[1].set_title('Sklearn Gradient Boosting - MSE: {:.4f}'.format(np.mean((preds2-f(line))**2)))
axes[1].set_xlabel('X')
axes[1].set_ylabel('Y')
axes[1].legend()

axes[2].scatter(X[:,0], y[:,0], color='blue', label='Training Data', s=0.1)
axes[2].plot(line, f(line), color='green', label='True Function')
axes[2].plot(line, preds3, color='red', label='Predicted')
axes[2].set_title('LGBM Linear Boosting - MSE: {:.4f}'.format(np.mean((preds3-f(line))**2)))
axes[2].set_xlabel('X')
axes[2].set_ylabel('Y')
axes[2].legend()

plt.tight_layout()
Xd, yd = fetch_california_housing(return_X_y=True)

#small train set, as the model is still quite slow
X_train, X_test, y_train, y_test = train_test_split(Xd, yd, test_size=0.975, random_state=123)

Xm = np.mean(X_train, 0).reshape(1,-1)
Xs = np.std(X_train, 0).reshape(1,-1)

ym = np.mean(y_train)
ys = np.std(y_train)

X_train = (X_train - Xm) / Xs
y_train = (y_train - ym) / ys

X_test = (X_test - Xm) / Xs
d_model = PolynomialBoostingModel(1,    #polynomial level
                                  0.25, #learning rate
                                  5.0,  #regression regularization
                                  100,  #n_trees
                                  10,   #min_samples_leaf
                                  1.0,  #goss alpha
                                  0.0,  #goss beta
                                  1)    #random seed

d_model.fit_fast(X_train, y_train.reshape(-1,1))

d_preds = d_model.predict_fast(X_test).reshape(-1) * ys + ym

d_model2 = GradientBoostingRegressor(max_depth=1, n_estimators=100, learning_rate=0.25)
d_model2.fit(X_train, y_train)

d_preds2 = d_model2.predict(X_test) * ys + ym
lgb_dataset = lgb.Dataset(X_train, label=y_train.reshape(-1), params={'linear_tree': True})

lgb_params = {
    "objective": "regression",
    "metric": "l2",
    "num_iterations": 100,
    "num_leaves": 2,
    "learning_rate": 0.25,
    "linear_lambda": 5.0,
    "min_data_in_leaf": 10,
    "verbosity": -1
}

d_model3 = lgb.train(lgb_params, lgb_dataset)

d_preds3 = d_model3.predict(X_test).reshape(-1) * ys + ym
np.mean((d_preds - y_test)**2)
0.40638200743172254

np.mean((d_preds2 - y_test)**2)
0.4231840786265534

np.mean((d_preds3 - y_test)**2)
0.41076314048606744
= fetch_california_housing()["feature_names"]
feature_names = pd.Series(d_model.get_feature_importances()[:8], index = feature_names)
importances
=False).plot(kind="bar")
importances.sort_values(ascending plt.tight_layout()
= pd.Series(d_model2.feature_importances_, index = feature_names)
importances2
=False).plot(kind="bar")
importances2.sort_values(ascending plt.tight_layout()
= pd.Series(d_model3.feature_importance()/np.sum(d_model3.feature_importance()), index=feature_names)
importances3
=False).plot(kind="bar")
importances3.sort_values(ascending plt.tight_layout()
Larger dataset
Xd, yd = fetch_california_housing(return_X_y=True)

#larger train set this time (80% of the data); GOSS sampling below keeps the computation feasible
X_train, X_test, y_train, y_test = train_test_split(Xd, yd, test_size=0.2, random_state=123)

Xm = np.mean(X_train, 0).reshape(1,-1)
Xs = np.std(X_train, 0).reshape(1,-1)

ym = np.mean(y_train)
ys = np.std(y_train)

X_train = (X_train - Xm) / Xs
y_train = (y_train - ym) / ys

X_test = (X_test - Xm) / Xs
Use GOSS sampling to keep the computational demand feasible (see the LightGBM paper, Algorithm 2).

Currently, not applying the specific weights to the randomly sampled subset in GOSS appears to outperform the exact GOSS proposal. This might be because the leaf regression models are not fit with those weights either; the weights only enter the total node losses.
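For reference, a minimal sketch of the GOSS selection step as described in Algorithm 2 of the LightGBM paper; this is not the polynomial_boosting implementation, and goss_sample is a hypothetical helper name. The idea: keep the top alpha fraction of samples by absolute gradient, draw a random beta fraction of the remainder, and, in the exact proposal, up-weight that random part by (1 - alpha) / beta.

#hedged sketch of GOSS sampling (LightGBM paper, Algorithm 2), not the polynomial_boosting code
def goss_sample(gradients, alpha=0.2, beta=0.025, rng=None):
    rng = np.random.default_rng(1) if rng is None else rng
    n = len(gradients)
    order = np.argsort(-np.abs(gradients))   #largest-gradient samples first
    n_top, n_rand = int(alpha * n), int(beta * n)
    top_idx = order[:n_top]                  #always keep the top alpha fraction
    rand_idx = rng.choice(order[n_top:], size=n_rand, replace=False)
    #exact GOSS re-weights the random subset by (1 - alpha) / beta; as noted
    #above, dropping this re-weighting worked better in this experiment
    weights = np.ones(n_top + n_rand)
    weights[n_top:] = (1 - alpha) / beta
    return np.concatenate([top_idx, rand_idx]), weights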
d_model = PolynomialBoostingModel(1,     #polynomial level
                                  0.25,  #learning rate
                                  5.0,   #regression regularization
                                  100,   #n_trees
                                  10,    #min_samples_leaf
                                  0.2,   #goss alpha
                                  0.025, #goss beta
                                  1)     #random seed

d_model.fit_fast(X_train, y_train.reshape(-1,1))

d_preds = d_model.predict_fast(X_test).reshape(-1) * ys + ym

d_model2 = GradientBoostingRegressor(max_depth=1, n_estimators=100, learning_rate=0.25, min_samples_leaf=10)
d_model2.fit(X_train, y_train)

d_preds2 = d_model2.predict(X_test) * ys + ym
lgb_dataset = lgb.Dataset(X_train, label=y_train.reshape(-1), params={'linear_tree': True})

lgb_params = {
    "objective": "regression",
    "metric": "l2",
    "num_iterations": 100,
    "num_leaves": 2,
    "learning_rate": 0.25,
    "linear_lambda": 5.0,
    "data_sampling_strategy": "goss",
    "top_rate": 0.2,     #=goss alpha
    "other_rate": 0.025, #=goss beta
    "verbosity": -1
}

d_model3 = lgb.train(lgb_params, lgb_dataset)

d_preds3 = d_model3.predict(X_test).reshape(-1) * ys + ym
d_model4 = PolynomialBoostingModel(2,     #polynomial level
                                   0.25,  #learning rate
                                   5.0,   #regression regularization
                                   100,   #n_trees
                                   10,    #min_samples_leaf
                                   0.2,   #goss alpha
                                   0.025, #goss beta
                                   1)     #random seed

d_model4.fit_fast(X_train, y_train.reshape(-1,1))

d_preds4 = d_model4.predict_fast(X_test).reshape(-1) * ys + ym
np.mean((d_preds - y_test)**2)   #boosted linear trees (p=1)
0.34790501763806014

np.mean((d_preds2 - y_test)**2)  #sklearn boosted trees (p=0)
0.3797511677878719

np.mean((d_preds3 - y_test)**2)  #lgbm boosted trees (p=1)
0.3640894459649956

np.mean((d_preds4 - y_test)**2)  #boosted quadratic trees (p=2)
0.33865369672689777
= fetch_california_housing()["feature_names"]
feature_names = pd.Series(d_model.get_feature_importances()[:8], index = feature_names)
importances
=False).plot(kind="bar")
importances.sort_values(ascending plt.tight_layout()
= pd.Series(d_model2.feature_importances_, index = feature_names)
importances2
=False).plot(kind="bar")
importances2.sort_values(ascending plt.tight_layout()
= pd.Series(d_model3.feature_importance()/np.sum(d_model3.feature_importance()), index=feature_names)
importances3
=False).plot(kind="bar")
importances3.sort_values(ascending plt.tight_layout()
= pd.Series(d_model4.get_feature_importances()[:8], index = feature_names)
importances4
=False).plot(kind="bar")
importances4.sort_values(ascending plt.tight_layout()